"""
CountVectorizer: 词频统计，将文本转换为词频矩阵
英文：天生空格分隔
中文：需要分词
"""
from sklearn.feature_extraction.text import CountVectorizer
import jieba

def cut_word_eng_demo():
    """Demonstrate bag-of-words feature extraction on English sentences.

    English text is already whitespace/punctuation delimited, so
    CountVectorizer can tokenize it directly — no segmentation needed.
    Prints the dense count matrix and the learned vocabulary.
    """
    samples = ['life is short,i like like python',
               'life is too long,i dislike python']
    # Build the vectorizer, then learn the vocabulary and count terms
    # in a single fit_transform call (returns a sparse matrix).
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(samples)
    print("data_new:\n", counts.toarray())
    print("特征名字：\n", vectorizer.get_feature_names_out())

    return None

def cut_word_ch_demo():
    """Demonstrate bag-of-words feature extraction on Chinese sentences.

    Chinese has no natural word delimiters, so each sentence is first
    segmented with jieba and the tokens re-joined with spaces so that
    CountVectorizer's whitespace-based tokenizer can split them.
    Prints the segmented sentences, the dense count matrix, and the
    learned vocabulary.
    """
    data = ["我爱北京天安门",
            "天安门上太阳升"]
    # Segment each sentence; a comprehension replaces the manual
    # for/append loop (idiomatic, ruff PERF401).
    data_new = [" ".join(jieba.cut(sent)) for sent in data]
    print("分词后的数据:\n", data_new)

    # Standard feature-extraction flow: fit vocabulary + transform to counts.
    transfer = CountVectorizer()
    data_final = transfer.fit_transform(data_new)
    print("data_final:\n", data_final.toarray())
    print("特征名：\n", transfer.get_feature_names_out())

    return None

if __name__ == '__main__':
    # Run the English demo first, then the Chinese one,
    # separated by a visual divider line.
    cut_word_eng_demo()
    print("-" * 50)
    cut_word_ch_demo()